library(here) # A Simpler Way to Find Your Files, CRAN v1.0.1
library(tidyverse) # Easily Install and Load the 'Tidyverse', CRAN v1.3.0
library(qiime2R) # Import qiime2 artifacts to R, [github::jbisanz/qiime2R] v0.99.35
library(phyloseq) # Handling and analysis of high-throughput microbiome census data, Bioconductor v1.34.0
library(microbiome) # Microbiome Analytics, Bioconductor v1.12.0
library(picante) # Integrating Phylogenies and Ecology, CRAN v1.8.2
library(MicrobeR) # Handy functions for microbiome analysis in R, [github::jbisanz/MicrobeR] v0.3.2
library(mixOmics) # Omics Data Integration, Bioconductor v6.14.0
library(RUVSeq) # Remove Unwanted Variation from RNA-Seq Data, Bioconductor v1.24.0
library(sva) # Surrogate Variable Analysis, Bioconductor v3.38.0
library(cowplot) # Streamlined Plot Theme and Plot Annotations for 'ggplot2', CRAN v1.1.1
library(ggpubr) # 'ggplot2' Based Publication Ready Plots, CRAN v0.4.0
library(ggh4x) # Hacks for 'ggplot2', CRAN v0.1.2.1
library(plotly) # Create Interactive Web Graphics via 'plotly.js', CRAN v4.9.3
library(RColorBrewer) # ColorBrewer Palettes, CRAN v1.1-2
library(vegan) # Community Ecology Package, CRAN v2.5-7
# metadata
mtd <- read_tsv(here("data/metadata.tsv"), comment = "#q2") %>%
rename(SampleID = "#SampleID") %>%
column_to_rownames("SampleID") %>%
mutate(PCRBatch = factor(PCRBatch),
Diet = factor(Diet, levels = c("REF", "IM")),
Compartment = factor(Compartment, levels = c("PI", "DI")),
SampleOrigin = factor(
SampleOrigin,
levels = c("Feed", "Water", "Digesta", "Mucosa",
"Mock", "DNA-extraction", "Amplicon-PCR")),
SampleType = factor(
SampleType,
levels = c("REF-PID", "REF-PIM", "REF-DID", "REF-DIM",
"IM-PID", "IM-PIM", "IM-DID", "IM-DIM",
"Feed", "Water", "Extraction-blank", "PCR-blank", "Mock")))
# feature table
otu <- read_qza(here("data/intermediate/qiime2/97otu/table-filtered-97otu-sepp-inserted.qza")) %>%
pluck("data") %>%
as("matrix")
# taxonomy
txnm <- read_qza(here("data/intermediate/qiime2/asv/taxonomy-silva132.qza"))
txnm <- txnm$data %>%
as.data.frame() %>%
mutate(Taxon = gsub("D_0", "k", Taxon), Taxon = gsub("D_1", "p", Taxon),
Taxon = gsub("D_2", "c", Taxon), Taxon = gsub("D_3", "o", Taxon),
Taxon = gsub("D_4", "f", Taxon), Taxon = gsub("D_5", "g", Taxon),
Taxon = gsub("D_6", "s", Taxon)) %>%
separate(Taxon, sep = ";", c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")) %>%
column_to_rownames("Feature.ID") %>%
select(-Confidence)
# phylogeny
tree <- read_qza(here("data/intermediate/qiime2/97otu/insertion-tree-97otu.qza"))
# assemble a phyloseq object
ps <- phyloseq(sample_data(mtd),
otu_table(otu, taxa_are_rows = TRUE),
tax_table(as.matrix(txnm)),
phy_tree(tree$data))
# change OTU names
indx <- formatC(1:ntaxa(ps), width = nchar(ntaxa(ps)), format = "d", flag = "0")
taxa_names(ps) <- paste0("OTU", indx)
# filter data
ps <- subset_samples(ps, !SampleType %in% c("Extraction-blank", "PCR-blank", "Mock")) # remove control samples
# extract metadata, otu table, taxonomy and phylogeny from phyloseq
mtd <- data.frame(sample_data(ps), check.names = FALSE)
otu <- as.data.frame(otu_table(ps)) %>% t()
txnm <- tax_table(ps) %>% as("matrix") %>% as.data.frame()
tree <- phy_tree(ps)
# Centered log-ratio transformation
otu_clr <- logratio.transfo(otu, logratio = "CLR", offset = 1 ) %>% as("matrix")
# PCA
pca_before <- pca(otu_clr, ncomp = 3)
# PCA plot
pcaPlot_before <- cbind(mtd, pca_before$variates$X) %>%
ggplot(aes(x = PC1, y = PC2, color = SampleType, shape = PCRBatch)) +
geom_point(size = 2) +
geom_line(aes(group = SampleName), color = "black") + # connect paired technical replicates by line
labs(title = "Before batch correction", color = "Sample type", shape = "PCR batch",
x = paste0("PC1: ", round(100 * pca_before$explained_variance[1], 2), "%"),
y = paste0("PC2: ", round(100 * pca_before$explained_variance[2], 2), "%")) +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
theme_bw()
pcaPlot_before
From the PCA plot, we can tell that technical replicate samples are far apart. Mucosa samples from fish fed the REF diet form 2 clusters based on PCR batches. There’s a strong PCR batch effect.
RUVSeq removes unwanted variation in RNASeq data using replicate samples and negative control genes. Here we use it for correcting batch effects in microbiome data. The two main parameters of RUVSeq are the number of factors of unwanted variation, k, and the set of negative control variables genes, or OTUs in our case. The choice of the parameter k is not easy and is data-dependent. Empirically, a few (2-3) factors are enough to capture the unwanted variation. In noisy data, K can be increased to 5 or 10. Note that if k is too high, the RUVSeq over-corrects for unwanted variation and ends up removing (almost) all the biological variability within the conditions. The choice of negative control variables is also somewhat data-dependent. However, RUVSeq is robust to the choice of negative control variables. One is recommend to perform extensive exploratory data analysis, comparing different values of k and sets of negative control variables.
# calculate coefficient of variation (cv) for each otu
cv <- t(otu) %>%
as.data.frame() %>%
rownames_to_column("featureID") %>%
mutate(sd = apply(.[, names(.) != "featureID"], 1, sd),
mean = apply(.[, names(.) != "featureID"], 1, mean),
cv = sd / mean) %>%
arrange(cv)
# proportions of otus used as negative controls
prp <- c(0.1, 0.2, 0.5, 1)
# number of unwanted variation factors
k <- c(1, 2, 3, 4, 5, 10)
# get combinations of k and prp for parameter tuning
prmt <- expand.grid(prp = prp, k = k)
# sample replicate
SampleType <- mtd$SampleType
names(SampleType) <- rownames(mtd)
rpl <- makeGroups(SampleType)
# batch effect correction
ruv <- map2(
prmt$prp,
prmt$k,
function(x, y)
{
# get otus showing least variance at defined proportions
min_cv <- cv[1:round(nrow(cv) * x), ] %>%
mutate(featureID = paste0("^", featureID, "$")) # exact math of otu ids
# subset otus used as negative controls
nc <- grepl(paste(min_cv$featureID, collapse = "|"), colnames(otu))
# run RUVs
ruv <- RUVs(t(otu), nc, y, rpl)
}
)
# assign names to list elements
names(ruv) <- paste0("ruv_k", prmt$k, "_nc", prmt$prp)
ComBat-seq is a batch effect correction method for RNASeq data. It is an improved model based on the popular effect correction tool, ComBat.ComBat-seq takes raw count matrix as input. Same as ComBat, it requires a known batch variable.
# PCR batch
PCRBatch <- mtd$PCRBatch
names(PCRBatch) <- rownames(mtd)
# model matrix
mod_mtrx <- model.matrix( ~ SampleType)
# run CombatSeq
cmbt <- ComBat_seq(t(otu), batch = PCRBatch, covar_mod = mod_mtrx)
## Found 3 batches
## Using null model in ComBat-seq.
## Adjusting for 9 covariate(s) or covariate level(s)
## Estimating dispersions
## Fitting the GLM model
## Shrinkage off - using GLM estimates for parameters
## Adjusting the data
Extract and log ratio transform batch effect corrected OTU table.
ruv_clr <- purrr::map(ruv, ~pluck(.x, "normalizedCounts")) %>% # get batch corrected count table
purrr::map(~t(.x)) %>% # transpose normalized count table
purrr::map(~logratio.transfo(.x, logratio = 'CLR', offset = 1)) %>% # CLR transformation
purrr::map(~as(.x, "matrix"))
Run PCA.
pca_ruv <- purrr::map(ruv_clr, ~pca(.x, ncomp = 3))
Make PCA plots.
The PCA plots with k = 4, 5 or 10 look promising. Let’s look at one of them in 3d dimensions.
# extract data
k4_nc1 <- cbind(mtd, pca_ruv$ruv_k4_nc1$variates$X)
# 3d PCA plot with plot_ly
plot_ly(
x = k4_nc1[, "PC1"], y = k4_nc1[, "PC2"], z = k4_nc1[, "PC3"],
type = "scatter3d", mode = "markers", color = k4_nc1$SampleType,
colors = "Paired") %>% #brewer.pal(n = 12, name = "Paired")
layout(scene = list(
xaxis = list(title = paste0('PC1: ', round(pca_ruv$ruv_k4_nc1$explained_variance[1]*100, 2), "%")),
yaxis = list(title = paste0('PC2: ', round(pca_ruv$ruv_k4_nc1$explained_variance[2]*100, 2), "%")),
zaxis = list(title = paste0('PC3: ', round(pca_ruv$ruv_k4_nc1$explained_variance[3]*100, 2), "%"))
)
)
# centered log-ratio transformation
cmbt_clr <- logratio.transfo(t(cmbt), logratio = 'CLR', offset = 1) %>%
as("matrix")
# PCA
pca_cmbt <- pca(cmbt_clr, ncomp = 3)
# make PCA plot
pcaPlot_cmbt <- cbind(mtd, pca_cmbt$variates$X) %>%
ggplot(aes(x = PC1, y = PC2, color = SampleType, shape = PCRBatch)) +
geom_point(size = 2) +
geom_line(aes(group = SampleName), color = "black") +
labs(title = "ComBat-Seq",
color = "Sample type", shape = "PCR batch",
x = paste0("PC1: ", round(100 * pca_cmbt$explained_variance[1], 2), "%"),
y = paste0("PC2: ", round(100 * pca_cmbt$explained_variance[2], 2), "%")) +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
theme_bw()
pcaPlot_cmbt
The partial redundancy analysis (pRDA) is a multivariate method to assess globally the effect of treatments and batch. Here, we run pRDA and get variance explained by treatments and batch.
# combine otu table before and after batch corrections in a list
otus <- c(before = list(otu_clr), ruv_clr, combatseq = list(cmbt_clr))
# merge otu tables and nest by batch correction methods
otus_nst <- purrr::map(otus, as.data.frame) %>%
bind_rows(.id = "batch_correction") %>%
group_nest(batch_correction, .key = "data")
# define experiment design
design <- data.frame(SampleType = rep(SampleType, length(otus)),
PCRBatch = rep(PCRBatch, length(otus)),
batch_correction = rep(names(otus), each = nrow(mtd)))
design_nst <- group_nest(design, batch_correction, .key = "design")
# run pRDA and extract variance explained by treatment and batch effects
rda <- inner_join(otus_nst, design_nst, by = "batch_correction") %>%
mutate(rda_trt = map2(data, design, ~rda(.x ~ PCRBatch + Condition(SampleType), data = .y)),
rda_bat = map2(data, design, ~rda(.x ~ SampleType + Condition(PCRBatch), data = .y)),
var_trt = map_dbl(rda_trt, ~.x$pCCA$tot.chi*100/.x$tot.chi),
var_bat = map_dbl(rda_bat, ~.x$pCCA$tot.chi*100/.x$tot.chi),
batch_correction = factor(batch_correction, levels = names(otus))) %>%
select(batch_correction, var_trt, var_bat) %>%
rename(Method = batch_correction, Treatment = var_trt, Batch = var_bat) %>%
pivot_longer(Treatment:Batch, names_to = "Type", values_to = "var_expl") %>%
mutate(var_expl = round(var_expl, 2))
Make bar plots showing variance explained by treatments and batch.
var_expl <- ggplot(rda, aes(x = fct_rev(Method), y = var_expl, fill = Type)) +
geom_bar(stat = "identity", position = 'dodge', colour = 'black') +
geom_text(aes(Method, var_expl + 5, label = var_expl), size = 3,
position = position_dodge(width = 0.9)) +
coord_flip() +
scale_y_continuous(limits = c(0, 100), expand = expansion(mult = c(0, 0))) +
labs(x = "", y = "Variance explained (%)") +
theme_bw(base_size = 12) +
guides(fill = guide_legend(reverse = TRUE))
var_expl
Based on the PCA plots and partial redundancy analysis (pRDA), we can conclude that the RUVSeq provides better batch effect corrections than the Combat-Seq does. In agreement with previous data, the RUVSeq is robust to the choice of negative control variables. Increasing the number of factors of unwanted variation, k, removes more unwanted variation. Setting k = 4, 5 or 10 yields very similar results. To avoid over-correcting batch effects, we choose the smallest K value, ie., k = 4 and nc = 1.
Get batch effect corrected OTU table and replace the original OTU table in the phyloseq object.
# extract batch effect corrected OTU table
otu_adj <- ruv$ruv_k4_nc1$normalizedCounts
# replace otu table in the phyloseq object
otu_table(ps) <- otu_table(otu_adj, taxa_are_rows = TRUE)
Make a table containing top 10 most abundant taxa at genus level.
taxa_tab <- Summarize.Taxa(otu_adj, txnm)$Genus %>% Make.Percent()
taxa_tab <- taxa_tab[order(rowMeans(taxa_tab), decreasing = T), ]
Others <- colSums(taxa_tab[11:nrow(taxa_tab), ])
taxa_tab <- rbind(taxa_tab[1:10, ], Others)
Tidy dataframe for making stacked bar plots.
taxa_tab <- as.data.frame(taxa_tab) %>%
rownames_to_column("Taxa") %>%
separate(Taxa, sep = ";", c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")) %>%
mutate(Phylum = ifelse(is.na(Phylum)|Phylum == "NA"|grepl("uncultured|Ambiguous|metagenome", Phylum),
Kingdom,
Phylum),
Class = ifelse(is.na(Class)|Class == "NA"|grepl("uncultured|Ambiguous|metagenome", Class),
Phylum,
Class),
Order = ifelse(is.na(Order)|Order == "NA"|grepl("uncultured|Ambiguous|metagenome", Order),
Class,
Order),
Family = ifelse(is.na(Family)|Family == "NA"|grepl("uncultured|Ambiguous|metagenome", Family),
Order,
Family),
Genus = ifelse(is.na(Genus)|Genus == "NA"|grepl("uncultured|Ambiguous|metagenome", Genus),
Family,
Genus)) %>%
select(-Kingdom, -(Class:Family)) %>%
pivot_longer(-c(Phylum, Genus), names_to = "SampleID", values_to = "Abundance") %>%
mutate(Phylum = gsub("p__", "", Phylum),
Genus = gsub("g__", "", Genus),
Genus = factor(Genus, levels = rev(unique(Genus)))) %>%
inner_join(rownames_to_column(mtd, "SampleID"), by = "SampleID")
Make stacked bar plots.
# define color scheme
col <- c("grey", brewer.pal(n = 10, name = "Paired"))
# water samples
taxa_bar_water <- filter(taxa_tab, SampleType == "Water") %>%
ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
geom_bar(stat = "identity") +
labs(x = "", y = "") +
scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) +
scale_fill_manual(values = col) +
guides(fill=guide_legend(ncol=1)) +
facet_wrap(~ SampleOrigin) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
strip.background = element_blank(),
legend.margin = margin(0, 0, 0, 0),
legend.box.margin = margin(5, -10, 5, -8))
# feed samples
taxa_bar_feed <- filter(taxa_tab, SampleType == "Feed") %>%
ggplot(aes(x = Diet, y = Abundance, fill = Genus)) +
geom_bar(stat = "identity", width = 0.5) +
labs(x = "Sample", y = "") +
scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) +
scale_fill_manual(values = col) +
facet_nested(~ SampleOrigin + Diet, scales = "free_x", nest_line = TRUE) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
strip.background = element_blank(),
legend.position = "none")
# samples from fish fed the REF diet
taxa_bar_ref <- filter(taxa_tab, !(IsTechnicalReplicate == "yes" & PCRBatch == 2)) %>%
filter(grepl("REF-", SampleType)) %>%
ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
geom_bar(stat = "identity") +
labs(x = "", y = "Relative abundance (%)") +
scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) +
scale_fill_manual(values = col) +
facet_nested(~ Diet + SampleOrigin + Compartment, scales = "free_x", nest_line = TRUE) +
theme_bw() +
theme(axis.text.x = element_blank(),
strip.background = element_blank(),
legend.position = "none")
# samples from fish fed the IM diet
taxa_bar_im <- filter(taxa_tab, !(IsTechnicalReplicate == "yes" & PCRBatch == 2)) %>%
filter(grepl("IM-", SampleType)) %>%
ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
geom_bar(stat = "identity") +
labs(x = "", y = "") +
scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) +
scale_fill_manual(values = col) +
facet_nested(~ Diet + SampleOrigin + Compartment, scales = "free_x", nest_line = TRUE) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
strip.background = element_blank(),
legend.position = "none")
# assemble plots
plot_grid(
taxa_bar_ref,
taxa_bar_feed + theme(plot.margin = margin(l = -0.5, unit = "cm")),
taxa_bar_im + theme(plot.margin = margin(l = -0.4, unit = "cm")),
taxa_bar_water + theme(plot.margin = margin(l = -0.3, unit = "cm")),
nrow = 1, align = 'h', axis = "tb",
rel_widths = c(6, 1.5, 6, 4.5))
Compared to the taxa barplot (data/intermediate/qiime2/97otu/taxa-bar-plots-filtered-97otu.qzv) before batch correction , the taxonomic composition of samples is very different after the batch correction by RUVSeq.
Compute alpha diversity indices.
# compute Observed features, Peilou's Evenness and Shannon's index
alph_npd <- alpha(ps, index = c("observed", "evenness_pielou", "diversity_shannon")) %>%
rownames_to_column("SampleID")
# compute Faith's Phylogenetic Diversity
alph_pd <- pd(samp = t(otu_table(ps)), tree = phy_tree(ps), include.root = F) %>%
select(PD) %>%
rownames_to_column("SampleID")
# combine data
alph <- inner_join(alph_npd, alph_pd, by = "SampleID") %>%
inner_join(rownames_to_column(mtd, "SampleID"), by = "SampleID") %>%
rename("Observed OTUs" = observed, "Pielou's evenness" = evenness_pielou,
"Shannon's index" = diversity_shannon, "Faith's PD" = PD) %>%
pivot_longer("Observed OTUs":"Faith's PD", names_to = "alph_indx", values_to = "value") %>%
mutate(alph_indx = factor(alph_indx, levels = c("Observed OTUs", "Pielou's evenness",
"Shannon's index", "Faith's PD")))
Alpha diversity of technical replicates.
filter(alph, IsTechnicalReplicate == "yes") %>%
ggplot(aes(x = PCRBatch, y = value)) +
geom_boxplot(aes(color = PCRBatch), width = 0.3) +
geom_point(aes(color = PCRBatch)) +
geom_line(aes(group = SampleName), linetype = "dashed") +
labs(x = "PCR batch", color = "PCR batch") +
facet_wrap(~alph_indx, scales = "free_y") +
scale_fill_brewer(palette = "Dark2") +
theme_bw()
Alpha diversity of REF-PIM and REF-DIM samples amplified in different PCR batches.
filter(alph, SampleType %in% c("REF-PIM", "REF-DIM")) %>%
ggplot(aes(x = SampleType, y = value)) +
geom_boxplot(width = 0.3) +
geom_point(aes(color = PCRBatch)) +
labs(x = "Sample type", color = "PCR batch") +
facet_wrap(~ alph_indx, scales = "free_y") +
scale_fill_brewer(palette = "Dark2") +
theme_bw()
In line with previous observations, unweighted alpha indices such as Observed OTUs and Faith’s PD are more easily influenced by technical variations than weighted alpha indices like Shannon’s index.
OTU table summary.
sampleSum <- colSums(otu_adj) %>% sort()
sampleSum
## AqFl1-134 AqFl1-031 AqFl1-062 AqFl1-065 AqFl1-091 AqFl1-073 AqFl1-087 AqFl1-033
## 873 1684 1782 1885 1888 1907 1941 1960
## AqFl1-133 AqFl1-069 AqFl1-083 AqFl1-017 AqFl1-052 AqFl1-001 AqFl1-128 AqFl1-078
## 1979 2164 2205 2210 2221 2259 2272 2287
## AqFl1-089 AqFl1-024 AqFl1-012 AqFl1-020 AqFl1-061 AqFl1-126 AqFl1-044 AqFl1-124
## 2321 2370 2452 2556 2558 2590 2665 2686
## AqFl1-002 AqFl1-058 AqFl1-059 AqFl1-004 AqFl1-094 AqFl1-053 AqFl1-014 AqFl1-074
## 2743 2776 2878 2945 2954 2987 2995 3098
## AqFl1-129 AqFl1-038 AqFl1-049 AqFl1-035 AqFl1-077 AqFl1-047 AqFl1-009 AqFl1-025
## 3140 3154 3157 3195 3271 3273 3332 3354
## AqFl1-063 AqFl1-021 AqFl1-029 AqFl1-068 AqFl1-066 AqFl1-007 AqFl1-055 AqFl1-123
## 3448 3480 3542 3591 3682 3694 3832 3908
## AqFl1-041 AqFl1-125 AqFl1-045 AqFl1-057 AqFl1-084 AqFl1-027 AqFl1-015 AqFl1-032
## 4040 4062 4087 4181 4201 4211 4251 4265
## AqFl1-127 AqFl1-081 AqFl1-088 AqFl1-072 AqFl1-080 AqFl1-042 AqFl1-013 AqFl1-095
## 4421 4423 4439 4443 4602 4605 4639 4645
## AqFl1-022 AqFl1-043 AqFl1-096 AqFl1-054 AqFl1-056 AqFl1-076 AqFl1-040 AqFl1-003
## 4694 4917 4926 4973 5100 5147 5337 5530
## AqFl1-030 AqFl1-130 AqFl1-046 AqFl1-008 AqFl1-086 AqFl1-064 AqFl1-023 AqFl1-028
## 5597 5622 5748 5849 5933 6292 6387 6621
## AqFl1-079 AqFl1-034 AqFl1-060 AqFl1-092 AqFl1-011 AqFl1-010 AqFl1-016 AqFl1-070
## 6750 6872 6926 7096 7354 7391 7429 7715
## AqFl1-039 AqFl1-051 AqFl1-019 AqFl1-050 AqFl1-082 AqFl1-018 AqFl1-071 AqFl1-026
## 7970 8849 8943 9125 9747 9874 10052 10225
## AqFl1-006 AqFl1-093 AqFl1-115 AqFl1-090 AqFl1-037 AqFl1-036 AqFl1-075 AqFl1-118
## 10926 11108 11298 11789 12187 12477 12660 13896
## AqFl1-005 AqFl1-067 AqFl1-048 AqFl1-116 AqFl1-119 AqFl1-120 AqFl1-121 AqFl1-117
## 14260 14401 16851 29419 31681 37869 38216 41612
## AqFl1-122
## 48178
We lost quite a lot of sequences after the batch effect correction. Here we rarefy OTU table at a sub-sampling depth of 1684 sequences.
ps_rf <- rarefy_even_depth(ps, sample.size = sampleSum[2], rngseed = 1910)
# pcoa analysis
ord_jac <- ordinate(ps_rf, method = "PCoA", distance = "jaccard")
# ordination plot
plot_ordination(ps_rf, ord_jac, color = "SampleType", shape = "PCRBatch") +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
geom_line(aes(group = SampleName), color = "black") +
geom_point(size = 3) +
labs(title = "PCoA of Jaccard distance after batch correcton by RUVSeq") +
theme_bw()
# pcoa analysis
ord_bc <- ordinate(ps_rf, method = "PCoA", distance = "bray")
# ordination plot
plot_ordination(ps_rf, ord_bc, color = "SampleType", shape = "PCRBatch") +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
geom_line(aes(group = SampleName), color = "black") +
geom_point(size = 3) +
labs(title = "PCoA of Bray-Curtis distance after batch correcton by RUVSeq") +
theme_bw()
Unweighted UniFrac distance.
# pcoa analysis
ord_uwuf <- ordinate(ps_rf, method = "PCoA", distance = "unifrac", weighted = FALSE)
# ordination plot
plot_ordination(ps_rf, ord_uwuf, color = "SampleType", shape = "PCRBatch") +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
geom_line(aes(group = SampleName), color = "black") +
geom_point(size = 3) +
labs(title = "PCoA of unweighted UniFrac distance after batch correcton by RUVSeq") +
theme_bw()
Weighted UniFrac distance.
# pcoa analysis
ord_wuf <- ordinate(ps_rf, method = "PCoA", distance = "unifrac", weighted = TRUE)
# ordination plot
plot_ordination(ps_rf, ord_wuf, color = "SampleType", shape = "PCRBatch") +
scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
geom_line(aes(group = SampleName), color = "black") +
geom_point(size = 3) +
labs(title = "PCoA of weighted UniFrac distance after batch correcton by RUVSeq") +
theme_bw()
PcoA plots based on various distance metrics show that batch effect is partly removed after the adjustment by RUVSeq.
RUVSeq largely corrected PCR batch effects. However, the batch correction removed a large proportions of sequences in many samples and resulted in very different taxonomic compositions. Hence, we’ll not correct the PCR batch effect but rather analyze the data separately or account for batch effect in the statistical models where applicable.
I’d like to thank Wang and Lê Cao for publishing the code along with their paper “Managing batch effects in microbiome data”. Part of their code is used for the present analysis.
sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
## system code page: 65001
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-2 plotly_4.9.3
## [3] ggh4x_0.1.2.1 ggpubr_0.4.0
## [5] cowplot_1.1.1 sva_3.38.0
## [7] genefilter_1.72.0 mgcv_1.8-35
## [9] RUVSeq_1.24.0 edgeR_3.32.0
## [11] limma_3.46.0 EDASeq_2.24.0
## [13] ShortRead_1.48.0 GenomicAlignments_1.26.0
## [15] SummarizedExperiment_1.20.0 MatrixGenerics_1.2.0
## [17] matrixStats_0.58.0 Rsamtools_2.6.0
## [19] GenomicRanges_1.42.0 GenomeInfoDb_1.26.0
## [21] Biostrings_2.58.0 XVector_0.30.0
## [23] IRanges_2.24.0 S4Vectors_0.28.0
## [25] BiocParallel_1.24.1 Biobase_2.50.0
## [27] BiocGenerics_0.36.0 mixOmics_6.14.0
## [29] MASS_7.3-53.1 MicrobeR_0.3.2
## [31] picante_1.8.2 nlme_3.1-152
## [33] vegan_2.5-7 lattice_0.20-41
## [35] permute_0.9-5 ape_5.5
## [37] microbiome_1.12.0 phyloseq_1.34.0
## [39] qiime2R_0.99.35 forcats_0.5.1
## [41] stringr_1.4.0 dplyr_1.0.5
## [43] purrr_0.3.4 readr_1.4.0
## [45] tidyr_1.1.3 tibble_3.1.1
## [47] ggplot2_3.3.3 tidyverse_1.3.1
## [49] here_1.0.1 knitr_1.33
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 rtracklayer_1.50.0 R.methodsS3_1.8.1
## [4] bit64_4.0.5 aroma.light_3.20.0 DelayedArray_0.16.0
## [7] R.utils_2.10.1 data.table_1.14.0 rpart_4.1-15
## [10] hwriter_1.3.2 RCurl_1.98-1.3 generics_0.1.0
## [13] GenomicFeatures_1.42.0 RSQLite_2.2.7 bit_4.0.4
## [16] xml2_1.3.2 lubridate_1.7.10 assertthat_0.2.1
## [19] xfun_0.22 hms_1.0.0 jquerylib_0.1.4
## [22] evaluate_0.14 fansi_0.4.2 progress_1.2.2
## [25] dbplyr_2.1.1 readxl_1.3.1 igraph_1.2.6
## [28] DBI_1.1.1 htmlwidgets_1.5.3 rARPACK_0.11-0
## [31] ellipsis_0.3.1 crosstalk_1.1.1 RSpectra_0.16-0
## [34] backports_1.2.1 annotate_1.68.0 biomaRt_2.46.0
## [37] vctrs_0.3.6 abind_1.4-5 cachem_1.0.4
## [40] withr_2.4.1 checkmate_2.0.0 treeio_1.14.0
## [43] prettyunits_1.1.1 cluster_2.1.1 lazyeval_0.2.2
## [46] crayon_1.4.1 ellipse_0.4.2 pkgconfig_2.0.3
## [49] zCompositions_1.3.4 labeling_0.4.2 nnet_7.3-15
## [52] rlang_0.4.10 lifecycle_1.0.0 BiocFileCache_1.14.0
## [55] modelr_0.1.8 cellranger_1.1.0 rprojroot_2.0.2
## [58] Matrix_1.3-2 aplot_0.0.6 phangorn_2.6.3
## [61] carData_3.0-4 Rhdf5lib_1.12.0 reprex_2.0.0
## [64] base64enc_0.1-3 png_0.1-7 viridisLite_0.3.0
## [67] bitops_1.0-7 R.oo_1.24.0 rhdf5filters_1.2.0
## [70] blob_1.2.1 jpeg_0.1-8.1 rstatix_0.7.0
## [73] DECIPHER_2.18.1 ggsignif_0.6.1 rtk_0.2.6.1
## [76] scales_1.1.1 memoise_2.0.0 magrittr_2.0.1
## [79] plyr_1.8.6 zlibbioc_1.36.0 compiler_4.0.5
## [82] philr_1.16.0 cli_2.5.0 ade4_1.7-16
## [85] patchwork_1.1.1 htmlTable_2.1.0 Formula_1.2-4
## [88] tidyselect_1.1.0 stringi_1.5.3 highr_0.9
## [91] yaml_2.2.1 askpass_1.1 locfit_1.5-9.4
## [94] latticeExtra_0.6-29 ggrepel_0.9.1 grid_4.0.5
## [97] sass_0.3.1 fastmatch_1.1-0 tools_4.0.5
## [100] rio_0.5.26 rstudioapi_0.13 foreach_1.5.1
## [103] foreign_0.8-81 gridExtra_2.3 farver_2.1.0
## [106] Rtsne_0.15 digest_0.6.27 rvcheck_0.1.8
## [109] BiocManager_1.30.12 quadprog_1.5-8 Rcpp_1.0.6
## [112] car_3.0-10 broom_0.7.6 httr_1.4.2
## [115] AnnotationDbi_1.52.0 colorspace_2.0-0 rvest_1.0.0
## [118] XML_3.99-0.5 fs_1.5.0 truncnorm_1.0-8
## [121] splines_4.0.5 tidytree_0.3.3 multtest_2.46.0
## [124] xtable_1.8-4 jsonlite_1.7.2 ggtree_2.4.0
## [127] corpcor_1.6.9 R6_2.5.0 Hmisc_4.5-0
## [130] NADA_1.6-1.1 pillar_1.6.0 htmltools_0.5.1.1
## [133] glue_1.4.2 fastmap_1.1.0 DT_0.18
## [136] codetools_0.2-18 utf8_1.2.1 bslib_0.2.4
## [139] curl_4.3 zip_2.1.1 openxlsx_4.2.3
## [142] openssl_1.4.3 survival_3.2-11 rmarkdown_2.7
## [145] biomformat_1.18.0 munsell_0.5.0 rhdf5_2.34.0
## [148] GenomeInfoDbData_1.2.4 iterators_1.0.13 haven_2.4.1
## [151] reshape2_1.4.4 gtable_0.3.0